library(tidyverse); library(cowplot)## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.3 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(phyloseq); library(decontam)
library(compositions); library(patchwork); library(ggupset)## Welcome to compositions, a package for compositional data analysis.
## Find an intro with "? compositions"
##
## Attaching package: 'compositions'
## The following objects are masked from 'package:stats':
##
## anova, cor, cov, dist, var
## The following objects are masked from 'package:base':
##
## %*%, norm, scale, scale.default
##
## Attaching package: 'patchwork'
## The following object is masked from 'package:cowplot':
##
## align_plots
Datasets included: * Gorda Ridge 2019 cruise * Axial seamount timeseries * Mid-Cayman Rise 2020 cruisee
merged_tax <- read_delim("data-input/taxonomy.tsv", delim = "\t")
merged_asv <- read_delim("data-input/microeuk-merged-asv-table.tsv", delim = "\t", skip = 1)
# head(merged_tax)metadata <- read.delim("data-input/samplelist-metadata.txt")
# head(metadata)asv_wtax <- merged_asv %>%
select(FeatureID = '#OTU ID', everything()) %>%
pivot_longer(cols = !FeatureID,
names_to = "SAMPLE", values_to = "value") %>%
left_join(merged_tax, by = c("FeatureID" = "Feature ID")) %>%
left_join(metadata) %>%
filter(!grepl("Siders_", SAMPLE)) %>%
filter(SAMPLETYPE != "Incubation") %>%
filter(SAMPLETYPE != "Microcolonizer") %>%
mutate(DATASET = case_when(
grepl("_GR_", SAMPLE) ~ "GR",
grepl("Gorda", SAMPLE) ~ "GR",
grepl("_MCR_", SAMPLE) ~ "MCR",
grepl("Axial", SAMPLE) ~ "Axial",
TRUE ~ "Control or blank")) %>%
separate(Taxon, c("Domain", "Supergroup",
"Phylum", "Class", "Order",
"Family", "Genus", "Species"), sep = ";", remove = FALSE) %>%
unite(SAMPLENAME, SITE, SAMPLETYPE, YEAR, VENT, SAMPLEID, sep = " ", remove = FALSE)## Joining, by = "SAMPLE"
## Warning: Expected 8 pieces. Additional pieces discarded in 154116 rows [163,
## 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179,
## 180, 181, 182, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 140238 rows [1,
## 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# View(asv_wtax)
# head(asv_wtax) ## Complete ASV table with full taxonomy names and annotated sample informationBarplots to show total number of sequences and total number of ASVs
# head(asv_wtax)
plot_grid(
# Total number of ASVs
asv_wtax %>%
filter(value > 0) %>%
ggplot(aes(x = SAMPLENAME)) +
geom_bar(stat = "count", width = 0.9) +
labs(y = "Total ASVs per sample", x = "") +
coord_flip() +
scale_y_continuous(position = "right") +
theme_linedraw() +
facet_grid(DATASET ~ ., scale = "free", space = "free") +
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 1),
axis.text.y = element_text(angle = 0, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black")),
asv_wtax %>%
group_by(SAMPLENAME, SITE, Domain, DATASET) %>%
summarise(SUM_SEQ_DOMAIN = sum(value)) %>%
ggplot(aes(x = SAMPLENAME, y = SUM_SEQ_DOMAIN, fill = Domain)) +
geom_bar(stat = "identity", color = "black", width = 0.9) +
labs(y = "Total sequences per sample", x = "") +
coord_flip() +
scale_y_continuous(position = "right") +
theme_linedraw() +
facet_grid(DATASET ~ ., scale = "free", space = "free") +
theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 1),
axis.text.y = element_text(angle = 0, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black"),
legend.position = "right"),
ncol = 2, align = c("hv"), axis = c("lr"))## `summarise()` has grouped output by 'SAMPLENAME', 'SITE', 'Domain'. You can override using the `.groups` argument.
# View(asv_wtax %>% filter(value > 0) %>%
# group_by(SAMPLENAME, DATASET, SITE) %>%
# summarise(SEQ_SUM = sum(value),
# ASV_COUNT = n()))Import sample description text file, import as phyloseq library, and remove potential contaminate ASVs and sequences. Catalog total number of ASVs and sequences removed from analysis.
# library(decontam); library(phyloseq)tax_matrix <- merged_tax %>%
select(FeatureID = `Feature ID`, Taxon) %>%
separate(Taxon, c("Domain", "Supergroup",
"Phylum", "Class", "Order",
"Family", "Genus", "Species"), sep = ";", remove = FALSE) %>%
column_to_rownames(var = "FeatureID") %>%
as.matrix## Warning: Expected 8 pieces. Additional pieces discarded in 2854 rows [2, 7, 8,
## 11, 14, 15, 16, 17, 18, 19, 20, 27, 28, 29, 32, 33, 34, 35, 36, 37, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 2597 rows [1, 3,
## 4, 5, 6, 9, 10, 12, 13, 21, 22, 23, 24, 25, 26, 30, 31, 39, 40, 42, ...].
asv_matrix <- merged_asv %>%
select(FeatureID = '#OTU ID', everything()) %>%
column_to_rownames(var = "FeatureID") %>%
as.matrix
# Align row names for each matrix
rownames(tax_matrix) <- row.names(asv_matrix)
# Set rownames of metadata table to SAMPLE information
row.names(metadata) <- metadata$SAMPLE# Import asv and tax matrices
ASV = otu_table(asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)
# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata)
# join as phyloseq object
physeq_wnames = merge_phyloseq(phylo_obj, samplenames)
# colnames(ASV)
## Check
# physeq_wnames# When "Control" appears in "Sample_or_Control column, this is a negative control"
sample_data(physeq_wnames)$is.neg <- sample_data(physeq_wnames)$Sample_or_Control == "Control"# ID contaminants using Prevalence information
contam_prev <- isContaminant(physeq_wnames,
method="prevalence",
neg="is.neg",
threshold = 0.5, normalize = TRUE) ## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 2 samples with zero total counts (or frequency).
## Warning in .is_contaminant(seqtab, conc = conc, neg = neg, method = method, :
## Removed 2 samples with zero total counts (or frequency).
# Report number of ASVs IDed as contamintants
table(contam_prev$contaminant)##
## FALSE TRUE
## 5412 39
0.5 - this threshold will ID contaminants in all samples that are more prevalent in negative controls than in positive samples.
# Subset contaminant ASVs
contams <- filter(contam_prev, contaminant == "TRUE")
list_of_contam_asvs <- as.character(row.names(contams))
# length(list_of_contam_asvs)
taxa_contam <- as.data.frame(tax_matrix) %>%
rownames_to_column(var = "FeatureID") %>%
filter(FeatureID %in% list_of_contam_asvs)
# head(taxa_contam)# View(asv_wtax)
asv_wtax_decon <- asv_wtax %>%
filter(!(FeatureID %in% list_of_contam_asvs)) %>%
filter(!(Sample_or_Control == "Control"))
tmp_orig <- (asv_wtax %>% filter(!(Sample_or_Control == "Control")))
# Stats on lost
x <- length(unique(tmp_orig$FeatureID)); x## [1] 5451
y <- length(unique(asv_wtax_decon$FeatureID)); y## [1] 5412
100*((y-x)/x) # 42 total ASVs lost## [1] -0.7154651
a <- sum(tmp_orig$value);a #4.55 million## [1] 2101838
b <- sum(asv_wtax_decon$value);b #4.14 million ## [1] 2044985
100*((b-a)/a)## [1] -2.704918
# Lost 9% of sequences from whole dataset.
## Subsample to clean ASVs
asv_wtax_wstats <- asv_wtax %>%
mutate(DECONTAM = case_when(
FeatureID %in% list_of_contam_asvs ~ "FAIL",
TRUE ~ "PASS"
))plot_grid(asv_wtax_wstats %>%
filter(value > 0) %>%
ggplot(aes(x = SAMPLE, fill = DECONTAM)) +
geom_bar(stat = "count", width = 0.9, color = "black") +
labs(y = "Total ASVs") +
coord_flip() +
theme_linedraw() +
facet_grid(DATASET ~ ., scale = "free", space = "free") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black"),
legend.position = "bottom"),
asv_wtax_wstats %>%
group_by(SAMPLE, SITE, DECONTAM, DATASET) %>%
summarise(SUM_SEQ_DOMAIN = sum(value)) %>%
ggplot(aes(x = SAMPLE, y = SUM_SEQ_DOMAIN, fill = DECONTAM)) +
geom_bar(stat = "identity", color = "black", width = 0.9) +
labs(y = "Total Sequences") +
coord_flip() +
theme_linedraw() +
facet_grid(DATASET ~ ., scale = "free", space = "free") +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black"),
legend.position = "bottom"),
ncol = 2)## `summarise()` has grouped output by 'SAMPLE', 'SITE', 'DECONTAM'. You can override using the `.groups` argument.
colnames(asv_wtax_wstats)## [1] "FeatureID" "SAMPLE" "value"
## [4] "Taxon" "Domain" "Supergroup"
## [7] "Phylum" "Class" "Order"
## [10] "Family" "Genus" "Species"
## [13] "Consensus" "SAMPLENAME" "VENT"
## [16] "COORDINATES" "SITE" "Sample_or_Control"
## [19] "SAMPLEID" "DEPTH" "SAMPLETYPE"
## [22] "YEAR" "TEMP" "pH"
## [25] "PercSeawater" "Mg" "NO3"
## [28] "H2" "H2S" "CH4"
## [31] "ProkConc" "Sample_actual" "Type"
## [34] "DATASET" "DECONTAM"
sites <- c("Piccard", "VonDamm", "Axial", "GordaRidge")
asv_insitu <- asv_wtax_wstats %>% filter(Sample_or_Control != "Control") %>%
filter(SITE %in% sites) %>%
filter(!grepl("_expTf_", SAMPLE)) %>%
filter(value > 0) %>%
filter(DECONTAM == "PASS")
# Get quick stats on totals
sum(asv_insitu$value) # 2million sequences## [1] 2044985
length(unique(asv_insitu$FeatureID)) #2923 ASVs## [1] 2925
Additional sample QC, check replicates, and determine if replicates should be averaged.
plot_grid(asv_insitu %>%
group_by(SAMPLENAME, VENT, DATASET, Domain) %>%
summarise(seqsum_var = sum(value),
asvcount_var = n()) %>%
pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>%
ggplot(aes(x = SAMPLENAME, y = value, fill = Domain)) +
geom_bar(color = "black", stat = "identity", position = "fill") +
facet_grid(VARIABLE ~ DATASET, space = "free", scales = "free") +
scale_y_continuous(expand = c(0,0)) +
theme_linedraw() +
scale_fill_brewer(palette = "Paired") +
theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom"),
asv_insitu %>%
group_by(SAMPLENAME, VENT, DATASET, Domain) %>%
summarise(seqsum_var = sum(value),
asvcount_var = n()) %>%
pivot_longer(ends_with("_var"), names_to = "VARIABLE") %>%
ggplot(aes(x = SAMPLENAME, y = value, fill = Domain)) +
geom_bar(color = "black", stat = "identity", position = "stack") +
facet_grid(VARIABLE ~ DATASET, space = "free_x", scales = "free") +
scale_y_continuous(expand = c(0,0)) +
theme_linedraw() +
scale_fill_brewer(palette = "Paired") +
theme(strip.background = element_blank(), strip.text = element_text(color = "black"),
axis.text.x = element_text(color = "black", angle = 90, hjust = 1, vjust = 0.5),
legend.position = "bottom"),
ncol = 2)## `summarise()` has grouped output by 'SAMPLENAME', 'VENT', 'DATASET'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME', 'VENT', 'DATASET'. You can override using the `.groups` argument.
asv_insitu %>%
filter(Domain == "Eukaryota") %>%
# unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>%
mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>%
group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>%
summarise(SEQ_AVG_REP = mean(value)) %>%
ungroup() %>%
group_by(SITE, SAMPLETYPE, VENT, Supergroup) %>%
summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>%
ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
geom_bar(stat = "identity", position = "stack", color = "black", width = 0.9) +
facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
theme_linedraw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black")) +
scale_y_continuous(expand = c(0,0)) +
scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c",
"#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45",
"#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc",
"#807dba", "#54278f", "#bdbdbd", "black"))## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT'. You can override using the `.groups` argument.
Repeat taxonomy barplot, but with relative abundance
asv_insitu %>%
filter(Domain == "Eukaryota") %>%
# unite(SampleIdentifier, VENT, SAMPLETYPE, sep = " ", remove = FALSE) %>%
mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>%
group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>%
summarise(SEQ_AVG_REP = mean(value)) %>%
ungroup() %>%
group_by(SITE, SAMPLETYPE, VENT, Supergroup) %>%
summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>%
ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
facet_grid(. ~ SITE + SAMPLETYPE, scale = "free", space = "free") +
theme_linedraw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black")) +
scale_y_continuous(expand = c(0,0)) +
scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c",
"#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45",
"#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc",
"#807dba", "#54278f", "#bdbdbd", "black"))## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT'. You can override using the `.groups` argument.
# head(asv_insitu)
# View(asv_insitu)
# length(unique(asv_insitu$FeatureID))
# View(asv_insitu %>% filter(value > 0) %>%
# group_by(SAMPLENAME, DATASET, SITE) %>%
# summarise(SEQ_SUM = sum(value),
# ASV_COUNT = n_distinct(FeatureID)))DivNet package - diversity estimation hypothesis testing from Amy Willis’s group. This will also characterize the uncertainty of the richness estimate. Richness estimation is flawed because of sample depth and processing methods.
library(phyloseq); library(breakaway); library(DivNet)
# Select eukaryotes only and create wide format dataframe
insitu_wide <- asv_insitu %>%
filter(Domain == "Eukaryota") %>%
filter(!grepl("_Plume001_", SAMPLE)) %>% #removing "near vent background", not relevant in other data sets
select(FeatureID, Taxon, SAMPLE, value) %>%
pivot_wider(names_from = SAMPLE, values_from = value, values_fill = 0)
# head(insitu_wide)
insitu_samples <- as.character(colnames(insitu_wide %>% select(-Taxon, -FeatureID)))
# insitu_samplesinsitu_tax_matrix <- insitu_wide %>%
select(FeatureID, Taxon) %>%
separate(Taxon, c("Domain", "Supergroup",
"Phylum", "Class", "Order",
"Family", "Genus", "Species"), sep = ";") %>%
column_to_rownames(var = "FeatureID") %>%
as.matrix## Warning: Expected 8 pieces. Additional pieces discarded in 1600 rows [2, 3, 4,
## 6, 9, 10, 11, 12, 13, 14, 17, 19, 20, 21, 22, 23, 26, 30, 32, 36, ...].
## Warning: Expected 8 pieces. Missing pieces filled with `NA` in 937 rows [1, 5,
## 7, 8, 15, 16, 18, 24, 25, 27, 28, 29, 31, 33, 34, 35, 37, 45, 47, 48, ...].
insitu_asv_matrix <- insitu_wide %>%
select(-Taxon) %>%
column_to_rownames(var = "FeatureID") %>%
as.matrix
# Align row names for each matrix
rownames(insitu_tax_matrix) <- row.names(insitu_asv_matrix)
## Extract relevant metadata information
# head(metadata)
metadata_insitu <- metadata %>%
filter(SAMPLE %in% insitu_samples) %>% # from reformatting df above
select(SAMPLE, VENT, SITE, SAMPLETYPE, YEAR) %>%
unite(SAMPLELABEL, VENT, SITE, SAMPLETYPE, YEAR, sep = "_", remove = FALSE) %>%
unite(TYPE_SITE, SITE, SAMPLETYPE, sep = "_", remove = FALSE)
# View(metadata_insitu)# Import asv and tax matrices
ASV = otu_table(insitu_asv_matrix, taxa_are_rows = TRUE)
TAX = tax_table(insitu_tax_matrix)
phylo_obj <- phyloseq(ASV, TAX)
# Import metadata as sample data in phyloseq
samplenames <- sample_data(metadata_insitu)
# View(samplenames)
# join as phyloseq object
physeq_insitu = merge_phyloseq(phylo_obj, samplenames)
## Check
physeq_insitu## phyloseq-class experiment-level object
## otu_table() OTU Table: [ 2537 taxa and 44 samples ]
## sample_data() Sample Data: [ 44 samples by 7 sample variables ]
## tax_table() Taxonomy Table: [ 2537 taxa by 8 taxonomic ranks ]
# head(tax_matrix[1:2,])Note options to apply divnet to
# Glom tax levels at the Order level, then perform divnet analysis
# order_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), base = 3)
# order_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "SAMPLELABEL", base = 3)
# order_divnet_TYPE <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "SAMPLETYPE", base = 3)
# order_divnet_TYPE_SITE <- divnet(tax_glom(physeq_insitu, taxrank = "Order"), X = "TYPE_SITE", base = 3)
# fam_divnet <- divnet(tax_glom(physeq_insitu, taxrank = "Family"), base = 3)# head(metadata_insitu)
# fxn_extract_divet <- function(df){
# df$shannon %>% summary %>%
# pivot_longer(cols = starts_with("estimate"), names_to = "ESTIMATE-shannon", values_to = "Shannon") %>%
# pivot_longer(cols = starts_with("error"), names_to = "ERROR-shannon", values_to = "Shannon-error") %>%
# pivot_longer(cols = starts_with("lower"), names_to = "LOWER-shannon", values_to = "Shannon-lower") %>%
# pivot_longer(cols = starts_with("upper"), names_to = "UPPER-shannon", values_to = "Shannon-upper") %>%
# left_join(df$simpson %>% summary %>%
# pivot_longer(cols = starts_with("estimate"), names_to = "ESTIMATE-simpson", values_to = "Simpson") %>%
# pivot_longer(cols = starts_with("error"), names_to = "ERROR-simpson", values_to = "Simpson-error") %>%
# pivot_longer(cols = starts_with("lower"), names_to = "LOWER-simpson", values_to = "Simpson-lower") %>%
# pivot_longer(cols = starts_with("upper"), names_to = "UPPER-simpson", values_to = "Simpson-upper"),
# by = c("sample_names" = "sample_names")) %>%
# left_join(metadata_insitu %>% rownames_to_column(var = "sample_names")) %>%
# select(-sample_names, -ends_with("-simpson"), -ends_with("-shannon"), -starts_with("model."), -starts_with("name.")) %>%
# distinct()
# }
#
# order_alpha_18s <- fxn_extract_divet(order_divnet)
# order_alpha_TYPE <- fxn_extract_divet(order_divnet_TYPE)
# order_alpha_TYPE_SITE <- fxn_extract_divet(order_divnet_TYPE_SITE)# head(order_alpha_18s)
# plot_grid(order_alpha_18s %>%
# # ggplot(aes(x = VENT, y = Shannon)) +
# ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
# # geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
# # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
# geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +
# geom_jitter(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.y = element_text(size = 14),
# axis.text.x = element_blank(),
# strip.background = element_blank(),
# strip.text = element_text(color = "black"),
# legend.position = "none",
# axis.ticks.x = element_blank()) +
# labs(x = "", y = "Shannon"),
# order_alpha_18s %>%
# # ggplot(aes(x = VENT, y = Simpson)) +
# ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
# # geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
# # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
# geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +
# geom_jitter(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.x = element_text(vjust = 1, hjust = 0.5, size = 14),
# axis.text = element_text(size = 14),
# strip.background = element_blank(),
# strip.text = element_blank(),
# legend.title = element_blank(),
# legend.position = "bottom") +
# labs(x = "", y = "Simpson"),
# ncol = 1, axis = c("lrt"), align = c("vh"))Save for presentation
# svg("Shannon-violin-plot.svg",)
# order_alpha_18s %>%
# # ggplot(aes(x = VENT, y = Shannon)) +
# ggplot(aes(x = SAMPLETYPE, y = Shannon, group = SAMPLETYPE)) +
# # geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
# # geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
# geom_violin(aes(fill = SAMPLETYPE), color = "#525252", alpha = 0.5, width = 0.5, draw_quantiles = c(0.25, 0.5, 0.75)) +
# geom_jitter(shape = 21, color = "#525252", size = 3, aes(fill = SAMPLETYPE)) +
# # scale_fill_brewer(palette = "Set2") +
# scale_fill_manual(values = c("#1c9099", "#fd8d3c", "#f768a1")) +
# theme_linedraw() +
# theme(axis.text.x = element_text(vjust = 1, hjust = 0.5, size = 14),
# axis.text = element_text(size = 14),
# strip.background = element_blank(),
# strip.text = element_blank(),
# legend.title = element_blank(),
# legend.position = "bottom") +
# labs(x = "", y = "Shannon")
# dev.off()# head(order_alpha_TYPE)
# plot_grid(order_alpha_TYPE %>%
# select(-SAMPLELABEL, -VENT, -SITE, -TYPE_SITE, -YEAR) %>%
# distinct() %>%
# ggplot(aes(x = SAMPLETYPE, y = Shannon)) +
# geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
# geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.x = element_blank(),
# strip.background = element_blank(),
# strip.text = element_text(color = "black"),
# legend.position = "none",
# axis.ticks = element_blank()) +
# labs(x = "", y = "Shannon"),
# order_alpha_TYPE %>%
# select(-SAMPLELABEL, -VENT, -SITE, -TYPE_SITE, -YEAR) %>%
# distinct() %>%
# ggplot(aes(x = SAMPLETYPE, y = Simpson)) +
# geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
# geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# # facet_grid(. ~ SITE + SAMPLETYPE + YEAR, space = "free_x", scales = "free_x") +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
# strip.background = element_blank(),
# strip.text = element_blank(),
# legend.title = element_blank(),
# legend.position = "bottom") +
# labs(x = "", y = "Simpson"),
# ncol = 1, axis = c("lr"), align = c("v"))# head(order_alpha_TYPE_SITE)
# plot_grid(order_alpha_TYPE_SITE %>%
# select(-SAMPLELABEL, -YEAR, -VENT) %>%
# distinct() %>%
# ggplot(aes(x = SAMPLETYPE, y = Shannon)) +
# geom_errorbar(aes(ymin = `Shannon-lower`, ymax = `Shannon-upper`), color = "#525252", width = 0.2) +
# geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# facet_grid(. ~ SITE, space = "free_x", scales = "free_x") +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.x = element_blank(),
# strip.background = element_blank(),
# strip.text = element_text(color = "black"),
# legend.position = "none",
# axis.ticks = element_blank()) +
# labs(x = "", y = "Shannon"),
# order_alpha_TYPE_SITE %>%
# select(-SAMPLELABEL, -YEAR, -VENT) %>%
# distinct() %>%
# ggplot(aes(x = SAMPLETYPE, y = Simpson)) +
# geom_errorbar(aes(ymin = `Simpson-lower`, ymax = `Simpson-upper`), color = "#525252", width = 0.2) +
# geom_point(shape = 21, color = "#525252", size = 2, aes(fill = SAMPLETYPE)) +
# facet_grid(. ~ SITE, space = "free_x", scales = "free_x") +
# scale_fill_brewer(palette = "Set2") +
# theme_linedraw() +
# theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
# strip.background = element_blank(),
# strip.text = element_blank(),
# legend.title = element_blank(),
# legend.position = "bottom") +
# labs(x = "", y = "Simpson"),
# ncol = 1, axis = c("lr"), align = c("v"))Set up analysis to classify each ASV
head(asv_insitu)## # A tibble: 6 × 35
## FeatureID SAMPLE value Taxon Domain Supergroup Phylum Class Order Family Genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0016c920… 70_MC… 45 Euka… Eukar… Rhizaria Cerco… Chlo… Chlo… Chlor… Loth…
## 2 004b2336… 77_MC… 21 Euka… Eukar… Alveolata Dinof… Dino… Dino… Dinop… Dino…
## 3 006f5664… 66_MC… 315 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 4 006f5664… 71_MC… 181 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 5 006f5664… 77_MC… 89 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 6 006f5664… 78_MC… 52 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## # … with 24 more variables: Species <chr>, Consensus <dbl>, SAMPLENAME <chr>,
## # VENT <chr>, COORDINATES <chr>, SITE <chr>, Sample_or_Control <chr>,
## # SAMPLEID <chr>, DEPTH <chr>, SAMPLETYPE <chr>, YEAR <int>, TEMP <dbl>,
## # pH <dbl>, PercSeawater <dbl>, Mg <dbl>, NO3 <dbl>, H2 <dbl>, H2S <dbl>,
## # CH4 <dbl>, ProkConc <dbl>, Sample_actual <chr>, Type <chr>, DATASET <chr>,
## # DECONTAM <chr>
# head(insitu_wide)
# unique(asv_insitu$SAMPLETYPE)
# unique(asv_insitu$SITE)
tax_asv_id <- asv_insitu %>%
filter(value > 0) %>% #remove zero values
select(FeatureID, SITE, SAMPLETYPE) %>% # isolate only ASVs that are PRESENT at a site and sampletype
distinct() %>% #unique this, as presense = present in at least 1 rep (where applicable)
unite(sample_id, SITE, SAMPLETYPE, sep = "_") %>%
# select(-SITE) %>%
# distinct() %>%
add_column(present = 1) %>%
pivot_wider(names_from = sample_id, values_from = present, values_fill = 0) %>%
rowwise() %>%
mutate_at(vars(FeatureID), factor)
#
# mutate(total = sum(c_across(where(!is.na(.))))) %>%
# mutate(FREQ = case_when(
# total == 1 ~ "Appears in only 1 sample type",
# total > 1 ~ "Appears in more than 1 location or sample type"
# ))
# head(tax_asv_id)library(purrr)
any_cols <- function(tax_asv_id) reduce(tax_asv_id, `|`)
asv_class <- tax_asv_id %>%
mutate(vent = ifelse(any_cols(across(contains("_Vent"), ~ . > 0)), "VENT", ""),
plume= ifelse(any_cols(across(contains("_Plume"), ~ . > 0)), "PLUME", ""),
bsw = ifelse(any_cols(across(contains("_Background"), ~ . > 0)), "BSW", ""),
) %>%
unite(class_tmp, vent, plume, bsw, sep = "_", na.rm = TRUE) %>%
mutate(CLASS = case_when(
class_tmp == "VENT__" ~ "Vent only",
class_tmp == "_PLUME_" ~ "Plume only",
class_tmp == "__BSW" ~ "Background only",
class_tmp == "VENT__BSW" ~ "Vent & background",
class_tmp == "VENT_PLUME_BSW" ~ "Vent, plume, & background",
class_tmp == "VENT_PLUME_" ~ "Vent & plume",
class_tmp == "_PLUME_BSW" ~ "Plume & background"
)) %>%
select(FeatureID, CLASS) %>% distinct()
colnames(tax_asv_id)## [1] "FeatureID" "VonDamm_Vent" "Piccard_Vent"
## [4] "Piccard_Background" "VonDamm_Plume" "Axial_Vent"
## [7] "VonDamm_Background" "Piccard_Plume" "GordaRidge_Vent"
## [10] "GordaRidge_Background" "GordaRidge_Plume" "Axial_Background"
## [13] "Axial_Plume"
head(tax_asv_id)## # A tibble: 6 × 13
## # Rowwise:
## FeatureID VonDamm_Vent Piccard_Vent Piccard_Backgro… VonDamm_Plume Axial_Vent
## <fct> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0016c9209… 1 0 0 0 0
## 2 004b23363… 0 1 0 0 0
## 3 006f5664d… 1 1 0 0 0
## 4 007322d9b… 0 1 0 0 0
## 5 008783681… 1 1 0 0 0
## 6 0090723b7… 1 0 0 0 0
## # … with 7 more variables: VonDamm_Background <dbl>, Piccard_Plume <dbl>,
## # GordaRidge_Vent <dbl>, GordaRidge_Background <dbl>, GordaRidge_Plume <dbl>,
## # Axial_Background <dbl>, Axial_Plume <dbl>
asv_class_SITE <- tax_asv_id %>%
mutate(mcr = ifelse(any_cols(across(contains("Piccard") | contains("VonDamm"), ~ . > 0)), "MCR", ""),
axial = ifelse(any_cols(across(contains("Axial"), ~ . > 0)), "AxS", ""),
gr = ifelse(any_cols(across(contains("Gorda"), ~ . > 0)), "GR", "")
) %>%
unite(class_tmp, mcr, axial, gr, sep = "_", na.rm = TRUE) %>%
mutate(SITE_CLASS = case_when(
class_tmp == "__GR" ~ "Gorda Ridge only",
class_tmp == "_AxS_" ~ "Axial only",
class_tmp == "_AxS_GR" ~ "Axial & Gorda Ridge",
class_tmp == "MCR__" ~ "Mid-Cayman Rise",
class_tmp == "MCR__GR" ~ "Mid-Cayman Rise & Gorda Ridge",
class_tmp == "MCR_AxS_" ~ "Mid-Cayman Rise & Axial",
class_tmp == "MCR_AxS_GR" ~ "All sites"
)) %>%
select(FeatureID, SITE_CLASS) %>% distinct()insitu_asv_wClass <- asv_insitu %>%
left_join(asv_class) %>%
left_join(asv_class_SITE)## Joining, by = "FeatureID"
## Joining, by = "FeatureID"
# head(insitu_asv_wClass)Bubble plot
# head(asv_insitu)
# svg("bubbles.svg", h = 4, w = 8)
asv_insitu %>%
select(DATASET, FeatureID, SAMPLETYPE) %>%
group_by(DATASET, SAMPLETYPE) %>%
summarise(COUNT = n_distinct(FeatureID)) %>%
ggplot(aes(x = DATASET, y = SAMPLETYPE, fill = SAMPLETYPE)) +
geom_point(aes(size = COUNT), shape = 21, color = "black") +
scale_size_continuous(range = c(4,20)) +
scale_fill_viridis_d(option = "B") +
theme_void() +
theme(legend.position = "right",
axis.text = element_text(color = "black"))## `summarise()` has grouped output by 'DATASET'. You can override using the `.groups` argument.
# dev.off()unique(insitu_asv_wClass$CLASS)## [1] "Vent only" "Vent & background"
## [3] "Plume only" "Background only"
## [5] "Vent, plume, & background" "Vent & plume"
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)## [1] "Mid-Cayman Rise" "Axial only"
## [3] "Gorda Ridge only" "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge" "Mid-Cayman Rise & Axial"
## [7] "All sites"
To explore microbial eukaryotic community diversity at all three sites, below functions have been written to pass 18S data for each site through the same analysis. This will be done for all sites together and for them individually.
Sections below highlight Axial Seamount, Mid-Cayman Rise, and Gorda Ridge data individually.
axial <- c("Axial")
mcr <- c("VonDamm", "Piccard")
gr <- c("GordaRidge")Create a bar plot showing the relative sequence abundance of 18S results to the Supergroup and Phylum level. Function averages across replicates and then sums to the phylum and supergroup level. Bar plot shows the relative sequence abundance.
make_bar_relabun <- function(df, selection){
df_out <- df %>%
filter(SITE %in% selection) %>%
filter(Domain == "Eukaryota") %>%
mutate(Supergroup = ifelse(is.na(Supergroup), "Unknown Eukaryota", Supergroup),
Phylum = ifelse(is.na(Phylum), "Unknown", Phylum),
Phylum = ifelse(Phylum == "Alveolata_X", "Ellobiopsidae", Phylum),
Supergroup = ifelse(Supergroup == "Alveolata", paste(Supergroup, Phylum, sep = "-"), Supergroup)) %>%
group_by(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species,
VENT, SITE, SAMPLETYPE, YEAR, DATASET) %>%
summarise(SEQ_AVG_REP = mean(value)) %>%
ungroup()
supergroup <- df_out %>%
group_by(SITE, SAMPLETYPE, VENT, YEAR, Supergroup) %>%
summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>%
ggplot(aes(x = VENT, y = SEQ_SUM, fill = Supergroup)) +
geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
facet_grid(. ~ SITE +YEAR + SAMPLETYPE, scale = "free", space = "free") +
theme_linedraw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black"),
legend.position = "right") +
scale_y_continuous(expand = c(0,0)) +
# scale_fill_brewer(palette = "Set2") +
scale_fill_manual(values = c("#fa9fb5", "#c51b8a", "#67000d", "#ef3b2c", "#ffffcc", "#feb24c", "#c7e9b4", "#1d91c0", "#253494", "#9e9ac8", "#238b45", "#54278f", "#bdbdbd", "#252525")) +
labs(x = "", y = "Relative abundance")
phylum <- df_out %>%
unite(SupergroupPhylum, Supergroup, Phylum, sep = "-") %>%
group_by(SITE, SAMPLETYPE, VENT, YEAR, SupergroupPhylum) %>%
summarise(SEQ_SUM = sum(SEQ_AVG_REP)) %>%
ggplot(aes(x = VENT, y = SEQ_SUM, fill = SupergroupPhylum)) +
geom_bar(stat = "identity", position = "fill", color = "black", width = 0.9) +
facet_grid(. ~ SITE +YEAR + SAMPLETYPE, scale = "free", space = "free") +
theme_linedraw() +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
strip.background = element_blank(), strip.text = element_text(color = "black"),
legend.position = "right") +
scale_y_continuous(expand = c(0,0)) +
scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c",
"#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45",
"#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc",
"#807dba", "#54278f", "#bdbdbd", "black", "white", "#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c",
"#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45")) +
labs(x = "", y = "Relative abundance")
supergroup + phylum + patchwork::plot_layout(ncol = 1)
}
# make_bar_relabun(insitu_asv_wClass, axial)Relative abundance plots are misleading, as this tag-sequence data is compositional. To combat this, we can also perform a center log-ratio transformation of the sequence counts. This tile plot (or heat map) will show the relationship from the data mean. Positive values thus demonstrate an increase in the taxa, while negative values illustrate the opposite.
# Not sure if I need this
tax_key <- insitu_asv_wClass %>%
select(FeatureID, Taxon, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species, CLASS, SITE_CLASS) %>%
distinct()Ahead of the CLR transformation, average across replicates, then sum to the Class level. THEN perform CLR transformation and plot as heat map.
make_clr_trans_tile <- function(df, selection){
df_wide <- df %>%
filter(SITE %in% selection) %>%
# df_wide <- insitu_asv_wClass %>%
# filter(SITE %in% axial) %>%
filter(Domain == "Eukaryota") %>%
# Average across replicates
group_by(FeatureID, SAMPLENAME, VENT, Domain, Supergroup, Phylum, Class, Order, Family, Genus, Species) %>%
summarise(AVG = mean(value)) %>%
ungroup() %>%
# Sum to the Order taxonomic classification
unite(SAMPLENAME_2, SAMPLENAME, VENT, sep = "_") %>%
group_by(SAMPLENAME_2, Supergroup, Phylum, Class) %>%
summarise(CLASS_SUM = sum(AVG)) %>%
unite(CLASS, Supergroup, Phylum, Class, sep = " ") %>%
select(CLASS, SAMPLENAME_2, CLASS_SUM) %>%
pivot_wider(names_from = SAMPLENAME_2, values_from = CLASS_SUM, values_fill = 0) %>%
column_to_rownames(var = "CLASS")
## Take wide data frame and CLR transform, pivot to wide, and plot
data.frame(compositions::clr(df_wide)) %>%
rownames_to_column(var = "CLASS") %>%
pivot_longer(cols = starts_with(selection), values_to = "CLR", names_to = "SAMPLENAME_2") %>%
separate(SAMPLENAME_2, c("SAMPLENAME", "VENT"), sep = "_") %>%
separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>%
mutate(VENT = str_replace_all(VENT, "\\.", " ")) %>%
mutate(REGION = case_when(
SITE == "GordaRidge" ~ "Gorda Ridge",
SITE %in% mcr ~ "Mid-Cayman Rise",
SITE == "Axial" ~ "Axial")) %>%
mutate(VENTNAME = case_when(
REGION == "Gorda Ridge" ~ VENT,
REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
)) %>% select(-Sample_tmp) %>%
unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>%
separate(CLASS, c("Supergroup", "Phylum", "Class"), sep = " ", remove = FALSE) %>%
ggplot(aes(x = SAMPLE, y = Class, fill = CLR)) +
geom_tile(color = "#252525") +
theme(legend.position = "right",
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
panel.border = element_blank(),
panel.background = element_blank(),
axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5, color = "black",size = 8),
axis.text.y = element_text(color = "black", size = 8),
strip.background = element_blank(),
strip.text.y = element_text(hjust = 0, vjust = 0.5, angle = 0),
# strip.text.x = element_blank(),
legend.title = element_blank()) +
labs(x = "", y = "") +
scale_fill_gradient2(low = "#4575b4", mid = "white", high = "#d73027", na.value = "grey50") +
facet_grid(Supergroup + Phylum ~ SAMPLETYPE, space = "free", scales = "free")
}Similar to aove, the first step in this function transforms data using CLR (to ASV level though). First plot will show eigen values (scree plot to determine if 2 vs. 3 dimensions is best for data). Then function extracts data points and creates PCA plot.
make_pca <- function(df, selection){
df_wide_asv <- df %>%
# df_wide_asv <- insitu_asv_wClass %>%
filter(SITE %in% selection) %>%
# filter(SITE %in% axial) %>%
filter(Domain == "Eukaryota") %>%
# Average across replicates
group_by(FeatureID, SAMPLENAME, VENT) %>%
summarise(AVG = mean(value)) %>%
ungroup() %>%
separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>%
mutate(REGION = case_when(
SITE == "GordaRidge" ~ "Gorda Ridge",
SITE %in% mcr ~ "Mid-Cayman Rise",
SITE == "Axial" ~ "Axial")) %>%
mutate(VENTNAME = case_when(
REGION == "Gorda Ridge" ~ VENT,
REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
)) %>% select(-Sample_tmp) %>%
unite(SAMPLE, SAMPLETYPE, REGION, VENTNAME, sep = "_", remove = FALSE) %>%
group_by(FeatureID, SAMPLE) %>%
summarise(SUM = sum(AVG)) %>%
pivot_wider(names_from = FeatureID, values_from = SUM, values_fill = 0) %>%
column_to_rownames(var = "SAMPLE")
# look at eigenvalues
pca_lr <- prcomp(data.frame(compositions::clr(df_wide_asv)))
variance_lr <- (pca_lr$sdev^2)/sum(pca_lr$sdev^2)
## View bar plot
barplot(variance_lr, main = "Log-Ratio PCA Screeplot", xlab = "PC Axis", ylab = "% Variance",
cex.names = 1.5, cex.axis = 1.5, cex.lab = 1.5, cex.main = 1.5)
## Extract PCR points
data.frame(pca_lr$x, SAMPLE = rownames(pca_lr$x)) %>%
separate(SAMPLE, c("SAMPLETYPE", "REGION", "VENTNAME"), sep = "_", remove = FALSE) %>%
## Generate PCA plot
ggplot(aes(x = PC1, y = PC2, shape = SAMPLETYPE, fill = VENTNAME)) +
geom_hline(yintercept = 0) + geom_vline(xintercept = 0, color = "#525252") +
geom_point(size=4, stroke = 1, aes(fill = VENTNAME)) +
ylab(paste0('PC2 ',round(variance_lr[2]*100,2),'%')) +
xlab(paste0('PC1 ',round(variance_lr[1]*100,2),'%')) +
scale_shape_manual(values = c(21, 23, 24)) +
# scale_fill_manual(values = fill_color) +
# scale_color_manual(values = color_color) +
theme_bw() +
theme(axis.text = element_text(color="black", size=12),
legend.title = element_blank(),
axis.title = element_text(color="black", size=14),
legend.text = element_text(color = "black", size = 14),
plot.margin = margin(2, 1, 2, 1, "cm")) +
guides(fill = guide_legend(override.aes = list(shape = 21) ),
shape = guide_legend(override.aes = list(fill = "black")))
}From complete dataset, average across replicates, then sum the total number of ASVs in each sample. Then plot a data point for total number of ASVs (ASV richness) by sample type - where sample type represents the vent, plume, vs. background. Box plots show the median and range.
make_asv_rich <- function(df, selection){
df %>%
filter(SITE %in% selection) %>%
filter(Domain == "Eukaryota") %>%
# Average across replicates
group_by(FeatureID, SAMPLENAME, VENT, Supergroup) %>%
summarise(AVG = mean(value)) %>%
separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>%
mutate(REGION = case_when(
SITE == "GordaRidge" ~ "Gorda Ridge",
SITE %in% mcr ~ "Mid-Cayman Rise",
SITE == "Axial" ~ "Axial")) %>%
mutate(VENTNAME = case_when(
REGION == "Gorda Ridge" ~ VENT,
REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
)) %>% select(-Sample_tmp) %>%
unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>%
ungroup() %>%
group_by(REGION, SAMPLE, SAMPLETYPE) %>%
summarise(NUM_ASV = n()) %>%
ggplot(aes(x = SAMPLETYPE, y = NUM_ASV, shape = SAMPLETYPE)) +
geom_boxplot(aes(group = SAMPLETYPE), alpha = 0.8, width = 0.4) +
geom_jitter(size=2, aes(fill = REGION)) +
scale_shape_manual(values = c(21, 23, 24)) +
theme_bw() +
theme(axis.text = element_text(color="black", size=12),
legend.title = element_blank(),
axis.title = element_text(color="black", size=14),
legend.text = element_text(color = "black", size = 14)) +
guides(fill = guide_legend(override.aes = list(shape = 21) ),
shape = guide_legend(override.aes = list(fill = "black") ) ) +
labs(x = "", y = "Total number of ASVs")
}Bar plot (colors correspond to Supergroup) represents the number of ASVs shared or unique to each sample. Combination matrix below bars shows which samples are considered for the bar plot.
make_upset_plot <- function(df, selection){
df %>%
filter(SITE %in% selection) %>%
filter(Domain == "Eukaryota") %>%
# Average across replicates
group_by(FeatureID, SAMPLENAME, VENT, Supergroup) %>%
summarise(AVG = mean(value)) %>%
separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>%
mutate(REGION = case_when(
SITE == "GordaRidge" ~ "Gorda Ridge",
SITE %in% mcr ~ "Mid-Cayman Rise",
SITE == "Axial" ~ "Axial")) %>%
mutate(VENTNAME = case_when(
REGION == "Gorda Ridge" ~ VENT,
REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
)) %>% select(-Sample_tmp) %>%
unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>%
distinct(FeatureID, Supergroup, AVG, SAMPLE, .keep_all = TRUE) %>%
group_by(FeatureID, Supergroup) %>%
summarise(SAMPLE = list(SAMPLE)) %>%
ggplot(aes(x = SAMPLE)) +
geom_bar(color = "black", width = 0.5, aes(fill = Supergroup)) +
scale_x_upset(n_intersections = 25) +
scale_y_continuous(expand = c(0,0)) +
labs(x = "", y = "Shared ASVs") +
theme_linedraw() +
theme(axis.text = element_text(color="black", size=10),
axis.title = element_text(color="black", size=10),
legend.text = element_text(color = "black", size = 10),
plot.margin = margin(1, 1, 1, 5, "cm")) +
scale_fill_manual(values = c("#f1eef6", "#d7b5d8", "#df65b0", "#ce1256", "#fc9272", "#ef3b2c",
"#800026", "#fff7bc", "#fec44f", "#d95f0e", "#a63603", "#74c476", "#238b45",
"#00441b", "#7fcdbb", "#084081", "#c6dbef", "#2b8cbe", "#016c59", "#bcbddc",
"#807dba", "#54278f", "#bdbdbd", "black"))
}make_bar_relabun(insitu_asv_wClass, axial)## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
Axial Seamount samples from archived material - span 2013, 2014, and 2015. First, the background and plume (from 2015 only, and from plume associated with the Anemone vent) are different from the vent samples - overwhelmingly stramneopile and rhizaria. For the background and plume, the stramenopiles appear to be associated with ochrophyta or opalozoa. For the plume, the rhizaria population was associated with cercozoa, while the background seawater was identified as belonging to radiolaria.
The major difference between the background/plume and vent sites was the higher relative sequence abundance of ciliates and opisthokonta. For the opisthokonta, these are primarily metazoa - and I will need to investigate this further. Exceptions for this include the ‘Dependable’ vent from 2013, which had a completely different composition, and ‘Marker 113’ in 2015, which the opisthokonta sequences were assigned choanoflagellate and fungi.
Further questions to consider
Any geochemical changes to Marker 113 from 2013/2014 to 2015? Could attribute difference of opisthokonta colonization.
Need to get metadata
Some of the Axial data had low sequence totals - is the Dependable vent one of these samples?
Yes it is - probably need to remove this sample! ## Tile plot CLR: Axial
make_clr_trans_tile(insitu_asv_wClass, axial)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 770 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
make_pca(insitu_asv_wClass, axial)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
make_asv_rich(insitu_asv_wClass, axial)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.
We only have 1 sample for background and plume from Axial Seamount. But this shows that the vent sites have varied ASV richness,
make_upset_plot(insitu_asv_wClass, axial)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 439 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 35 rows containing non-finite values (stat_count).
make_bar_relabun(insitu_asv_wClass, mcr)## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
make_clr_trans_tile(insitu_asv_wClass, mcr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 1378 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
make_pca(insitu_asv_wClass, mcr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
make_asv_rich(insitu_asv_wClass, mcr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.
make_upset_plot(insitu_asv_wClass, mcr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 3809 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 498 rows containing non-finite values (stat_count).
make_bar_relabun(insitu_asv_wClass, gr)## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
make_clr_trans_tile(insitu_asv_wClass, gr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 918 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
make_pca(insitu_asv_wClass, gr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
make_asv_rich(insitu_asv_wClass, gr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.
make_upset_plot(insitu_asv_wClass, gr)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 401 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 16 rows containing non-finite values (stat_count).
all <- c("Axial", "VonDamm", "Piccard", "GordaRidge")
mcr <- c("VonDamm", "Piccard")make_bar_relabun(insitu_asv_wClass, all)## `summarise()` has grouped output by 'FeatureID', 'Taxon', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species', 'VENT', 'SITE', 'SAMPLETYPE', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SITE', 'SAMPLETYPE', 'VENT', 'YEAR'. You can override using the `.groups` argument.
make_clr_trans_tile(insitu_asv_wClass, all)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Domain', 'Supergroup', 'Phylum', 'Class', 'Order', 'Family', 'Genus'. You can override using the `.groups` argument.
## `summarise()` has grouped output by 'SAMPLENAME_2', 'Supergroup', 'Phylum'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 5016 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
make_pca(insitu_asv_wClass, all)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
make_asv_rich(insitu_asv_wClass, all)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'REGION', 'SAMPLE'. You can override using the `.groups` argument.
make_upset_plot(insitu_asv_wClass, all)## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## `summarise()` has grouped output by 'FeatureID'. You can override using the `.groups` argument.
## Warning: Removed 787 rows containing non-finite values (stat_count).
Isolate shared and unique for some combinations, but only a few - as key examples! at the ASV level.
# head(insitu_asv_wClass) # from above, where I've classified each ASV by site and occurence in sample type
unique(insitu_asv_wClass$CLASS)## [1] "Vent only" "Vent & background"
## [3] "Plume only" "Background only"
## [5] "Vent, plume, & background" "Vent & plume"
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)## [1] "Mid-Cayman Rise" "Axial only"
## [3] "Gorda Ridge only" "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge" "Mid-Cayman Rise & Axial"
## [7] "All sites"
unique(insitu_asv_wClass$SAMPLETYPE)## [1] "Vent" "Background" "Plume"
head(insitu_asv_wClass)## # A tibble: 6 × 37
## FeatureID SAMPLE value Taxon Domain Supergroup Phylum Class Order Family Genus
## <chr> <chr> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 0016c920… 70_MC… 45 Euka… Eukar… Rhizaria Cerco… Chlo… Chlo… Chlor… Loth…
## 2 004b2336… 77_MC… 21 Euka… Eukar… Alveolata Dinof… Dino… Dino… Dinop… Dino…
## 3 006f5664… 66_MC… 315 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 4 006f5664… 71_MC… 181 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 5 006f5664… 77_MC… 89 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## 6 006f5664… 78_MC… 52 Euka… Eukar… Stramenop… Ochro… Chry… Chry… Chrys… Chry…
## # … with 26 more variables: Species <chr>, Consensus <dbl>, SAMPLENAME <chr>,
## # VENT <chr>, COORDINATES <chr>, SITE <chr>, Sample_or_Control <chr>,
## # SAMPLEID <chr>, DEPTH <chr>, SAMPLETYPE <chr>, YEAR <int>, TEMP <dbl>,
## # pH <dbl>, PercSeawater <dbl>, Mg <dbl>, NO3 <dbl>, H2 <dbl>, H2S <dbl>,
## # CH4 <dbl>, ProkConc <dbl>, Sample_actual <chr>, Type <chr>, DATASET <chr>,
## # DECONTAM <chr>, CLASS <chr>, SITE_CLASS <chr>
unique(insitu_asv_wClass$CLASS)## [1] "Vent only" "Vent & background"
## [3] "Plume only" "Background only"
## [5] "Vent, plume, & background" "Vent & plume"
## [7] "Plume & background"
unique(insitu_asv_wClass$SITE_CLASS)## [1] "Mid-Cayman Rise" "Axial only"
## [3] "Gorda Ridge only" "Mid-Cayman Rise & Gorda Ridge"
## [5] "Axial & Gorda Ridge" "Mid-Cayman Rise & Axial"
## [7] "All sites"
unique(insitu_asv_wClass$SAMPLETYPE)## [1] "Vent" "Background" "Plume"
insitu_asv_wClass %>%
# filter(SITE %in% selection) %>%
filter(Domain == "Eukaryota") %>%
# Average across replicates
group_by(FeatureID, SAMPLENAME, VENT, Supergroup, CLASS, SITE_CLASS) %>%
summarise(AVG = mean(value)) %>%
separate(SAMPLENAME, c("SITE", "SAMPLETYPE", "YEAR", "Sample_tmp"), remove = TRUE) %>%
mutate(REGION = case_when(
SITE == "GordaRidge" ~ "Gorda Ridge",
SITE %in% mcr ~ "Mid-Cayman Rise",
SITE == "Axial" ~ "Axial")) %>%
mutate(VENTNAME = case_when(
REGION == "Gorda Ridge" ~ VENT,
REGION == "Mid-Cayman Rise" ~ paste(SITE, VENT, sep = " "),
REGION == "Axial" ~ paste(VENT, YEAR, sep = " ")
)) %>% select(-Sample_tmp) %>%
unite(SAMPLE, REGION, VENTNAME, sep = " ", remove = FALSE) %>% head## `summarise()` has grouped output by 'FeatureID', 'SAMPLENAME', 'VENT', 'Supergroup', 'CLASS'. You can override using the `.groups` argument.
## Warning: Expected 4 pieces. Additional pieces discarded in 4649 rows [1, 2, 3,
## 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
## # A tibble: 6 × 12
## # Groups: FeatureID, VENT, Supergroup, CLASS [6]
## FeatureID SITE SAMPLETYPE YEAR VENT Supergroup CLASS SITE_CLASS AVG
## <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 0016c920953… VonDa… Vent 2020 Rav2 Rhizaria Vent… Mid-Cayma… 45
## 2 004b23363bd… Picca… Vent 2020 Shrimp… Alveolata Vent… Mid-Cayma… 21
## 3 006f5664d25… Picca… Vent 2020 LotsOS… Stramenop… Vent… Mid-Cayma… 52
## 4 006f5664d25… Picca… Vent 2020 Shrimp… Stramenop… Vent… Mid-Cayma… 89
## 5 006f5664d25… VonDa… Vent 2020 ArrowL… Stramenop… Vent… Mid-Cayma… 315
## 6 006f5664d25… VonDa… Vent 2020 OldMan… Stramenop… Vent… Mid-Cayma… 181
## # … with 3 more variables: SAMPLE <chr>, REGION <chr>, VENTNAME <chr>
Compare bubble plots with various metadata parameters.
Import grazing data as output from previous, plot with bubbles underneath of specific parameters.